library(dplyr)
library(readr)
library(ggplot2)
library(openxlsx)
library(knitr)
library(tibble)
library(stringr)
library(stringi)
library(readxl)
library(lubridate)
library(shiny)
library(plotly)
library(ruODK)

1 Loading the Data and Removal of Training Data

# Unzip and extract ODK data from ODK zip
# `export_load_from_odk()` is a helper defined outside this section; it is
# called with the service settings in `params$svc` and its result becomes `df`.
# NOTE(review): presumably one row per audit event — confirm against the helper.
df <- export_load_from_odk(params$svc)
## <ruODK settings>
##   Default ODK Central Project ID: 2 
##   Default ODK Central Form ID: 02-TIMCI-SPA-CGEI 
##   Default ODK Central URL: https://timicodktest.smartforest.de 
##   Default ODK Central Username: lucas.silbernagel@swisstph.ch 
##   Default ODK Central Password: run ruODK::get_default_pw() to show 
##   Default ODK Central Passphrase: run ruODK::get_default_pp() to show 
##   Default Time Zone: Europe/Berlin 
##   Default ODK Central Version: 1.1 
##   Default HTTP GET retries: 3 
##   Verbose messages: TRUE 
##   Test ODK Central Project ID:  
##   Test ODK Central Form ID:  
##   Test ODK Central Form ID (ZIP tests):  
##   Test ODK Central Form ID (Attachment tests):  
##   Test ODK Central Form ID (Parsing tests):  
##   Test ODK Central Form ID (WKT tests):  
##   Test ODK Central URL:  
##   Test ODK Central Username:  
##   Test ODK Central Password: run ruODK::get_test_pw() to show 
##   Test ODK Central Passphrase: run ruODK::get_test_pp() to show 
##   Test ODK Central Version: 1.1
# Convert the `start` and `end` columns from integer milliseconds to time
# stamps, running both columns through the helper `format_date_ms()`.
time_columns <- c("start", "end")
df[time_columns] <- lapply(df[time_columns], format_date_ms)

# Show the first rows of the loaded data
head(df)
instance.ID event node start end latitude longitude accuracy old.value new.value
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 form start NA 2021-02-02 15:18:42 NA NA NA NA NA NA
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 group questions /data/front_page 2021-02-02 15:18:42 2021-02-02 15:18:43 NA NA NA NA NA
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 form resume NA 2021-02-09 09:09:37 NA NA NA NA NA NA
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 jump NA 2021-02-09 09:09:37 2021-02-09 09:09:51 NA NA NA NA NA
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 group questions /data/g5 2021-02-09 09:09:48 2021-02-09 09:09:51 NA NA NA NA NA
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 group questions /data/a1 2021-02-09 09:09:51 2021-02-09 09:09:57 NA NA NA NA NA
# filtering for events that occurred after 18th July 21
#df <- subset(df, as.Date(start) > as.Date("18.07.2021", "%d.%m.%Y"))

2 Deriving New Features

2.1 Time Spent per Event

# Time spent on each event, in whole seconds (end minus start).
# The unit is pinned explicitly: subtracting two date-times yields a difftime
# whose unit (secs, mins, hours, days) is chosen automatically from the data,
# so `as.numeric()` on the bare difference is not guaranteed to be in seconds.
# Later steps pass this column to `seconds_to_period()` and divide it by 60 to
# get minutes, so it has to be seconds.
df$time_spent <- round(as.numeric(difftime(df$end, df$start, units = "secs")))

2.2 Question

# Reduce every node path to its question name by applying the helper
# `create_question()` to each node string.
df[["question"]] <- sapply(df[["node"]], create_question)

2.3 Question Decoded

# `decode_question()` is a helper defined outside this section. It receives the
# data, the question names and the service settings; its result replaces `df`.
# NOTE(review): presumably adds the `question_decoded` column used by the
# aggregations further down — confirm against the helper.
df <- decode_question(df, df$question, params$svc)
## <ruODK settings>
##   Default ODK Central Project ID: 2 
##   Default ODK Central Form ID: 02-TIMCI-SPA-CGEI 
##   Default ODK Central URL: https://timicodktest.smartforest.de 
##   Default ODK Central Username: lucas.silbernagel@swisstph.ch 
##   Default ODK Central Password: run ruODK::get_default_pw() to show 
##   Default ODK Central Passphrase: run ruODK::get_default_pp() to show 
##   Default Time Zone: Europe/Berlin 
##   Default ODK Central Version: 1.1 
##   Default HTTP GET retries: 3 
##   Verbose messages: TRUE 
##   Test ODK Central Project ID:  
##   Test ODK Central Form ID:  
##   Test ODK Central Form ID (ZIP tests):  
##   Test ODK Central Form ID (Attachment tests):  
##   Test ODK Central Form ID (Parsing tests):  
##   Test ODK Central Form ID (WKT tests):  
##   Test ODK Central URL:  
##   Test ODK Central Username:  
##   Test ODK Central Password: run ruODK::get_test_pw() to show 
##   Test ODK Central Passphrase: run ruODK::get_test_pp() to show 
##   Test ODK Central Version: 1.1

2.4 Categorical Answers Decoded

# `decode_categories()` is a helper defined outside this section. It receives
# the data and the service settings; its result replaces `df`.
# NOTE(review): presumably adds `new_value_decoded` / `old_value_decoded` —
# confirm against the helper.
df <- decode_categories(df, params$svc)
## <ruODK settings>
##   Default ODK Central Project ID: 2 
##   Default ODK Central Form ID: 02-TIMCI-SPA-CGEI 
##   Default ODK Central URL: https://timicodktest.smartforest.de 
##   Default ODK Central Username: lucas.silbernagel@swisstph.ch 
##   Default ODK Central Password: run ruODK::get_default_pw() to show 
##   Default ODK Central Passphrase: run ruODK::get_default_pp() to show 
##   Default Time Zone: Europe/Berlin 
##   Default ODK Central Version: 1.1 
##   Default HTTP GET retries: 3 
##   Verbose messages: TRUE 
##   Test ODK Central Project ID:  
##   Test ODK Central Form ID:  
##   Test ODK Central Form ID (ZIP tests):  
##   Test ODK Central Form ID (Attachment tests):  
##   Test ODK Central Form ID (Parsing tests):  
##   Test ODK Central Form ID (WKT tests):  
##   Test ODK Central URL:  
##   Test ODK Central Username:  
##   Test ODK Central Password: run ruODK::get_test_pw() to show 
##   Test ODK Central Passphrase: run ruODK::get_test_pp() to show 
##   Test ODK Central Version: 1.1

2.5 Time until a Response was Changed + Stream of Answer Changes

# Derive `time_till_change`: for an event whose old value equals the new value
# recorded by the directly preceding event of the same question, the number of
# whole seconds between the end of that preceding event and the start of this
# one. All other rows keep NA.
#
# This is a vectorised replacement for a row-by-row loop. Compared with the
# loop it
#   * works for an empty data frame and has no special case for the first row
#     (`lag()` yields NA there, so the row is simply not flagged),
#   * always measures in seconds instead of whatever unit a scalar date-time
#     subtraction happens to pick for that particular pair of rows,
#   * only pairs a row with its predecessor when both belong to the same
#     instance and node, so that equal values of two different questions are
#     not mistaken for an answer change.
df <- df %>%
  # bring consecutive events of one question within one instance next to
  # each other, in chronological order
  arrange(instance.ID, node, start) %>%
  mutate(
    # TRUE when this row overwrites the value set by the row right above it;
    # FALSE or NA otherwise (missing values never count as a match)
    answer_changed = !is.na(old.value) &
      !is.na(dplyr::lag(new.value)) &
      old.value == dplyr::lag(new.value) &
      instance.ID == dplyr::lag(instance.ID) &
      node == dplyr::lag(node),
    time_till_change = if_else(
      answer_changed %in% TRUE,
      round(as.numeric(difftime(start, dplyr::lag(end), units = "secs"))),
      NA_real_
    ),
    # placeholder column: created here, but not filled in this section
    changed_from = NA
  ) %>%
  # drop the helper flag so only the two new feature columns are appended
  select(-answer_changed)

2.6 Preview and Summary of the Final Data

# Show the first rows of the data with the derived columns
head(df)
instance.ID event node start end latitude longitude accuracy old.value new.value time_spent question question_decoded new_value_decoded old_value_decoded time_till_change changed_from
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 group questions /data/a1 2021-02-09 09:09:51 2021-02-09 09:09:57 NA NA NA NA NA 6 a1 Participant identification NA NA NA NA
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 group questions /data/a1 2021-02-09 09:10:16 2021-02-09 09:10:52 NA NA NA NA NA 36 a1 Participant identification NA NA NA NA
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 question /data/a1/a1_a_4 2021-02-09 09:10:16 2021-02-09 09:10:52 NA NA NA NA K-F019-P0106 36 a1_a_4 Please scan the participant’s QR code K-F019-P0106 NA NA NA
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 group questions /data/b1 2021-02-09 09:09:57 2021-02-09 09:10:03 NA NA NA NA NA 6 b1 Facility identification NA NA NA NA
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 group questions /data/b1 2021-02-09 09:10:08 2021-02-09 09:10:16 NA NA NA NA NA 8 b1 Facility identification NA NA NA NA
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 question /data/b1/b1_4 2021-02-09 09:10:08 2021-02-09 09:10:16 NA NA NA NA Mbour 8 b1_4 Please select the current district Mbour NA NA NA
# Per-column summary (type, range / quartiles, number of NAs)
summary(df)
##  instance.ID           event               node               start                    
##  Length:77          Length:77          Length:77          Min.   :2021-02-02 15:18:42  
##  Class :character   Class :character   Class :character   1st Qu.:2021-02-09 09:11:02  
##  Mode  :character   Mode  :character   Mode  :character   Median :2021-02-09 09:45:45  
##                                                           Mean   :2021-04-10 07:29:35  
##                                                           3rd Qu.:2021-06-21 13:26:01  
##                                                           Max.   :2021-06-21 13:26:23  
##                                                                                        
##       end                      latitude       longitude      accuracy       old.value     
##  Min.   :2021-02-02 15:18:43   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
##  1st Qu.:2021-02-09 09:11:52   NA's:77        NA's:77        NA's:77        NA's:77       
##  Median :2021-02-09 09:17:39                                                              
##  Mean   :2021-04-11 11:04:47                                                              
##  3rd Qu.:2021-06-21 13:26:06                                                              
##  Max.   :2021-06-21 13:26:23                                                              
##  NA's   :10                                                                               
##   new.value           time_spent       question         question_decoded   new_value_decoded 
##  Length:77          Min.   :  2.00   Length:77          Length:77          Length:77         
##  Class :character   1st Qu.:  5.00   Class :character   Class :character   Class :character  
##  Mode  :character   Median :  8.00   Mode  :character   Mode  :character   Mode  :character  
##                     Mean   : 33.75                                                           
##                     3rd Qu.: 20.00                                                           
##                     Max.   :236.00                                                           
##                     NA's   :10                                                               
##  old_value_decoded time_till_change changed_from  
##  Mode:logical      Mode:logical     Mode:logical  
##  NA's:77           NA's:77          NA's:77       
##                                                   
##                                                   
##                                                   
##                                                   
## 

3 General Information about the Data

# Key figures for the overview: number of distinct instances, number of
# events (rows), date of the earliest start and date of the latest known end.
no_inst <- n_distinct(df$instance.ID)
no_event <- nrow(df)
earliest_start <- as.Date(min(df$start))
latest_end <- as.Date(max(df$end, na.rm = TRUE))

Total number of instances: 2
Total number of events/questions: 77
Examination period: 2021-02-02 - 2021-06-21

4 Grouped by Time

4.1 Events/Questions Started by Day

# Number of events/questions started on each calendar day
df_by_day <- df %>%
  mutate(start_date = as.Date(start)) %>%
  count(start_date, name = "count")

# Daily counts as a line, overlaid with a red loess smoother (no confidence
# band). The x-axis label previously read "Satrt Date".
gg1 <- ggplot(df_by_day, aes(x = start_date, y = count)) +
  geom_line() +
  geom_smooth(alpha = 0.5, colour = "red", method = "loess", se = FALSE) +
  labs(
    title = "Number of Events/Questions Started by Day with Smoothed Regression Line",
    y = "Number of Questions/Events Started",
    x = "Start Date"
  ) +
  theme_light()
gg1

4.2 Questions/Events started by Weekday and Hour of the Day

# Count started events per weekday (week starting on Monday) and hour of day
df_wday_hour <- df %>%
  mutate(
    wday = wday(start, label = TRUE, week_start = 1),
    hour = hour(start)
  ) %>%
  count(wday, hour, name = "count_wday_hour") %>%
  arrange(desc(wday))

# Minimal theme for the heat map: no grid, border, ticks, x-axis title or
# legend; centred bold title
theme_heatmap <- theme_light() +
  theme(
    plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 8),
    legend.position = "none"
  )

# Heat map with weekdays across the top and hours running downwards from 0 to
# 23; every tile is labelled with its count
hours_desc <- 23:0
gg2 <- ggplot(
  df_wday_hour,
  aes(x = wday, y = hour, fill = count_wday_hour)
) +
  geom_tile(colour = "white") +
  scale_fill_gradient(low = "#fff0f0", high = "#940606") +
  scale_y_reverse(breaks = hours_desc, labels = hours_desc, expand = c(0, 0)) +
  scale_x_discrete(expand = c(0, 0), position = "top") +
  labs(
    title = "Number of Started Events/Questions by Day of Week / Hour of Day",
    y = "Hour of Day"
  ) +
  geom_text(aes(label = count_wday_hour), size = 2) +
  theme_heatmap
gg2

4.3 Distribution of Time Spent per Event/Question with largest 5 % removed

# Keep only events strictly below the 95th percentile of time spent
# (rows with a missing time are dropped by `subset()` as well)
time_spent_cutoff <- quantile(df$time_spent, 0.95, na.rm = TRUE)
df_clean <- subset(df, time_spent < time_spent_cutoff)

# Histogram of the remaining times, converted from seconds to minutes
minutes_spent <- df_clean$time_spent[!is.na(df_clean$time_spent)] / 60
hist(
  minutes_spent,
  breaks = 20,
  xlab = "Time Spent in Minutes",
  main = "Histogram of the Time Spent by Question"
)

5 Aggregated by Event/Question

5.1 Median Time Spent by Question

# Median time spent per decoded question, longest first, shown as a period.
#  * `na.rm = TRUE`: an event without an end time has a missing `time_spent`;
#    without it a single such event turns the whole question's median into NA.
#  * The median is rounded while it is still a plain number of seconds and only
#    then converted. Rounding the period itself rounds just its seconds part
#    without carrying over, which can print values such as "60S".
df_median_time_per_question <- df %>%
  filter(event == "question") %>%
  group_by(question_decoded) %>%
  summarise(median_time_spent = median(time_spent, na.rm = TRUE)) %>%
  arrange(desc(median_time_spent)) %>%
  mutate(median_time_spent = seconds_to_period(round(median_time_spent)))

df_median_time_per_question
question_decoded median_time_spent
Did you miss work to bring the child to the facility today? 2M 2S
Did you pay for something at the facility today? 2M 2S
Do you intend to buy some medicines outside of the facility? 2M 2S
Is this facility the closest health facility to your home? 2M 2S
Can you show me all the medicines and prescriptions that you received? 51S
Did the provider explain to you how to give these medicines to the child at home? 51S
How confident do you feel in how much of the medication to give each day and how many days to give it? 51S
Were you given general information or advice about feeding or breastfeeding? 48S
Were you informed of signs / symptoms that require you to bring the child back to the facility immediately? 48S
What do you intend to do if the sick child does not get completely better or become worse? 48S
Please scan the participant’s QR code 36S
Did the provider speak in a language you understand? 14S
Did you feel the provider treated you and the child with respect? 14S
Did you find the provider showed concern and empathy? 14S
Did you find the provider was kind to you? 14S
How do you feel overall with the service you received at the facility today? 14S
Was the service delayed or were you kept waiting for a long time? 14S
Would you recommend this facility to a friend / family with a sick child? 14S
If QR code scanning is not possible, please manually enter the participant identification code 11S
Please select the current district 10S
Did the provider give or prescribe any medicines for the child to take home? 6S
Did the provider refer the child? 6S
Did the provider tell you what illness your child has? 6S
Did the provider use the device that is represented in the following picture during the consultation of the child? 3S
Did the provider use a tablet like this one for the consultation of the child? 2S

5.2 Count of Input Changes and Median Time until Input was Changed by Question

# Per decoded question: how often an answer was changed, plus median and
# standard deviation of the time until the change. Only questions with more
# than one change are kept, most frequently changed first.
# Median and standard deviation are rounded as plain seconds *before* being
# converted to periods; rounding a period rounds only its seconds part without
# carrying over and can print values such as "60S".
df_changes_per_question <- df %>%
  filter(
    event == "question",
    !is.na(time_till_change)
  ) %>%
  group_by(question_decoded) %>%
  summarise(
    count_input_changes = n(),
    median_time_till_change = median(time_till_change),
    sd_time_till_change = sd(time_till_change)
  ) %>%
  filter(count_input_changes > 1) %>%
  arrange(desc(count_input_changes)) %>%
  mutate(
    median_time_till_change = seconds_to_period(round(median_time_till_change)),
    sd_time_till_change = seconds_to_period(round(sd_time_till_change, 1))
  )

df_changes_per_question
question_decoded count_input_changes median_time_till_change sd_time_till_change

5.3 Count of Old-New Value Pairs

# For rows where an answer change was detected: how often each combination of
# question, old value and new value occurs. Combinations seen only once are
# dropped; the rest are listed with the most frequent first.
df_stream <- df %>%
  filter(!is.na(time_till_change)) %>%
  count(
    question_decoded, old_value_decoded, new_value_decoded,
    name = "count_value_pairs"
  ) %>%
  filter(count_value_pairs > 1) %>%
  arrange(desc(count_value_pairs))

df_stream
question_decoded old_value_decoded new_value_decoded count_value_pairs

6 Aggregated by Instance

6.1 Top 10 % of Duration by Instance

# Instances whose total duration (first start to last end) lies above the
# 90th percentile of all instance durations, longest first.
#  * The duration is computed in seconds explicitly. A bare date-time
#    subtraction picks its own unit (secs/mins/hours/days), while
#    `seconds_to_period()` reads the plain number as seconds.
#  * Sorting happens on the numeric seconds, before the conversion.
#  * Rounding happens before the conversion; rounding the period itself rounds
#    only its seconds part and produced output such as "58M 60S".
df_duration_per_inst <- df %>%
  group_by(instance.ID) %>%
  summarise(
    duration_per_inst = as.numeric(
      difftime(
        max(end, na.rm = TRUE),
        min(start, na.rm = TRUE),
        units = "secs"
      )
    )
  ) %>%
  filter(duration_per_inst > quantile(duration_per_inst, 0.9, na.rm = TRUE)) %>%
  arrange(desc(duration_per_inst)) %>%
  mutate(duration_per_inst = seconds_to_period(round(duration_per_inst)))

df_duration_per_inst
instance.ID duration_per_inst
uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 6d 17H 58M 60S

6.2 Distribution of Duration by Instance with Top 10 % excluded

# Total duration per instance (first start to last end) for all instances
# strictly below the 90th percentile. The duration is computed in seconds
# explicitly, because the histogram below divides by 60 to obtain minutes; a
# bare date-time subtraction may come back in minutes, hours or days instead.
df_subsetted <- df %>%
  group_by(instance.ID) %>%
  summarise(
    duration_per_inst = as.numeric(
      difftime(
        max(end, na.rm = TRUE),
        min(start, na.rm = TRUE),
        units = "secs"
      )
    )
  ) %>%
  filter(duration_per_inst < quantile(duration_per_inst, 0.9, na.rm = TRUE))

hist(
  df_subsetted$duration_per_inst / 60,
  breaks = 30,
  main = "Duration per Instance in Minutes (outliers removed)",
  xlab = "Duration in Minutes"
)

7 Irregularities and Outliers

7.1 Time Till Change Outliers (for all data without removed outliers)

# Answer changes whose time until the change lies above the 90th percentile of
# all such times, slowest first, reduced to the identifying columns and with
# the time shown as a period.
time_till_change_cutoff <- quantile(df$time_till_change, 0.9, na.rm = TRUE)

df_time_till_change_outliers <- df %>%
  filter(time_till_change > time_till_change_cutoff) %>%
  arrange(desc(time_till_change)) %>%
  transmute(
    instance.ID,
    question_decoded,
    old_value_decoded,
    new_value_decoded,
    time_till_change = round(seconds_to_period(time_till_change))
  )

df_time_till_change_outliers
instance.ID question_decoded old_value_decoded new_value_decoded time_till_change

7.2 Histograms of Instances with Inconsistent Filling Behaviour

# Flag instances with an inconsistent filling behaviour: the time span of an
# instance is cut into 10 equal-width bins, and the instance is flagged when
# its events fall into fewer than 5 of them.
# The flags are computed in one pass and used to subset the id vector, instead
# of appending to a result vector inside a loop.
instance_ids <- unique(df$instance.ID)
has_few_bins <- vapply(
  instance_ids,
  function(id) {
    bin_vec <- cut(df$start[df$instance.ID == id], breaks = 10, labels = FALSE)
    length(unique(bin_vec)) < 5
  },
  logical(1)
)
irregular_inst <- instance_ids[has_few_bins]
paste0(length(irregular_inst), " out of ", length(instance_ids), " instances were found to have an inconsistent filling behaviour.")
## [1] "1 out of 2 instances were found to have an inconsistent filling behaviour."
# For every flagged instance, overlay a histogram of its events across ten
# equal-width time bins ("1. Part" ... "10. Part") and collect the decoded
# questions that fall into the last bin.
part_labels <- paste0(seq_len(10), ". Part")
fig <- plot_ly(alpha = 0.1)

# one list slot per instance, filled in the loop and flattened afterwards,
# rather than re-growing a vector on every iteration
last_bin_by_inst <- vector("list", length(irregular_inst))
for (k in seq_along(irregular_inst)) {
  id <- irregular_inst[[k]]
  temp_df <- df[df$instance.ID == id, ]
  temp_df$cut <- cut(temp_df$start, breaks = 10, labels = part_labels)
  fig <- fig %>% add_histogram(x = temp_df$cut, name = id)

  last_bin_by_inst[[k]] <- temp_df$question_decoded[temp_df$cut == "10. Part"]
}
last_bin_questions <- unlist(last_bin_by_inst)

fig <- fig %>% layout(barmode = "overlay")
fig

# Frequency table of the questions found in the last bin, most frequent first.
# Skipped when nothing was collected, because `table()` stops with an error
# when given an empty (NULL) input.
if (length(last_bin_questions) > 0) {
  kable(table(last_bin_questions) %>% as.data.frame() %>% arrange(desc(Freq)))
}
last_bin_questions Freq
Facility identification 2
Participant identification 2
Can you show me all the medicines and prescriptions that you received? 1
Consultation satisfaction 1
Cost 1
Counselling and follow-up advice 1
Did the provider explain to you how to give these medicines to the child at home? 1
Did the provider give or prescribe any medicines for the child to take home? 1
Did the provider refer the child? 1
Did the provider speak in a language you understand? 1
Did the provider tell you what illness your child has? 1
Did you feel the provider treated you and the child with respect? 1
Did you find the provider showed concern and empathy? 1
Did you find the provider was kind to you? 1
Did you miss work to bring the child to the facility today? 1
Did you pay for something at the facility today? 1
Do you intend to buy some medicines outside of the facility? 1
front_page 1
How confident do you feel in how much of the medication to give each day and how many days to give it? 1
How do you feel overall with the service you received at the facility today? 1
Is this facility the closest health facility to your home? 1
Please scan the participant’s QR code 1
Please select the current district 1
Treatment 1
Was the service delayed or were you kept waiting for a long time? 1
Were you given general information or advice about feeding or breastfeeding? 1
Were you informed of signs / symptoms that require you to bring the child back to the facility immediately? 1
What do you intend to do if the sick child does not get completely better or become worse? 1
Would you recommend this facility to a friend / family with a sick child? 1